Let \(X \sim \sum_i^k \pi_i \mathcal{N}_i(\mu_i, \sigma^2)\). With \(k = 10\) and \(n = 1000\), \(N = nk\), \(\pi_i = 1/n\).
n <- 2e3
k <- 10
N <- n*k
set.seed(53)
mux <- c(seq(1,5,by=1),8, 10,15, 20, 25)
mu0 <- mux
(sigma1 <- rep(1,10))# [1] 1 1 1 1 1 1 1 1 1 1
Z <- rep(1:k, each = n)Here are the means
plot(mu0, col = 1:10, pch = 19, cex = 3)We now generate the data with
X0 <- data.frame(x1 = sapply(Z, function(x) rnorm(1, mu0[x], sigma1[x])), Z = Z)
head(X0[sample(dim(X0)[1]),])# x1 Z
# 12664 9.943007 7
# 12617 8.381332 7
# 6000 2.230395 3
# 11073 5.135620 6
# 18730 24.113318 10
# 18019 25.842470 10
dat <- X0[, 1]And here are the data colored with the truth.
ggCol <- c('#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a')
ggplot(data = X0, aes(x=x1, y = Z, ,color = as.factor(Z))) +
scale_color_manual(values = ggCol) +
geom_point(alpha = 0.5)Run with \(jk\)-means with \(j=2, k=10\):
options: max iteration=100 convergence error tolerance = 1E-8
K <- k
j <- 2
set.seed(2^10)
system.time(
jk <- jkmeans::jkmeansEM(as.matrix(dat), K, j , 100, tol = 1E-8)
)
jkv <- apply(jk$zeta, 1, function(x) which( x == max(x) ))system.time({
kv <- kmeans(dat, centers = mu0)
}
)# user system elapsed
# 0.012 0.000 0.012
system.time(
mcv <- Mclust(dat,
G = K,
modelNames="E")
)# user system elapsed
# 0.240 0.012 0.252
mcv# 'Mclust' model object:
# best model: univariate, equal variance (E) with 10 components
# Names ari
# 1 Truth v km 0.5311323
# 2 Truth v jk 0.5494590
# 3 Truth v mclust 0.5613259
Y <- data.frame(X0, kv = kv$cluster,
mc=mcv$classification,
jk=jkv)
p1d <- ggplot(data = Y, aes(x=x1, y=Z)) +
scale_color_manual(values = ggCol) Let \(X \sim \sum_i^k \pi_i \mathcal{N}_i(\mu_i, \sigma^2_i I)\). With \(k = 10\) and \(n = 1000\), \(N = nk\).
n <- 1e3
k <- 10
N <- n*k
set.seed(49)
mux <- sort(sample(-20:20, 10))
muy <- sample(-20:20, 10)
(mu0 <- cbind(mux, muy))# mux muy
# [1,] -19 -10
# [2,] -17 -3
# [3,] -11 -16
# [4,] -10 5
# [5,] -6 16
# [6,] -3 -20
# [7,] -1 2
# [8,] 3 -13
# [9,] 6 -5
# [10,] 18 -6
(sigma1 <- sample(c(1,1,1,2,3,4,5), 10, replace=TRUE))# [1] 1 4 2 4 1 5 5 1 5 1
sigma0 <- lapply(sigma1, function(x) matrix(x*c(1,0,0,1), 2, 2))
Z <- rep(1:k, each = n)Here are the means
plot(mu0, col = 1:10, pch = 19, cex = 3)We now generate the data with
X0 <- data.frame(t(sapply(Z, function(x) rmvnorm(1, mu0[x,], sigma0[[x]]))), Z)
names(X0) <- c('x1', 'x2', 'Z')
head(X0[sample(dim(X0)[1]),])# x1 x2 Z
# 5564 -1.391007 -20.7115938 6
# 3350 -11.039710 4.7961061 4
# 1000 -19.664400 -9.6635771 1
# 8542 5.388799 -0.2730803 9
# 1467 -13.249741 1.0983979 2
# 3507 -11.265466 2.7168030 4
dat <- as.matrix(X0[, 1:2])And here are the data colored with the truth.
ggCol <- c('#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a')
ggplot(data = X0, aes(x=x1, y=x2, color = as.factor(Z))) +
scale_color_manual(values = ggCol) +
geom_point(alpha = 0.5)Run with \(jk\)-means with \(j=2, k=10\):
options: max iteration=100 convergence error tolerance = 1E-8
K <- k
j <- 2
set.seed(5431)
set.seed(2^10)
system.time(
jk <- jkmeans::jkmeansEM(dat, K, j , 100, tol = 1E-8)
)# user system elapsed
# 0.693 0.004 0.691
jkv <- apply(jk$zeta, 1, function(x) which( x == max(x) ))system.time({
kv <- kmeans(dat, centers = mu0)
}
)# user system elapsed
# 0.002 0.000 0.002
system.time(
mcv <- Mclust(dat,
G = K,
modelNames="EII")
)# user system elapsed
# 171.500 0.697 172.218
mcv# 'Mclust' model object:
# best model: spherical, equal volume (EII) with 10 components
# Names ari
# 1 Truth v km 0.9579922
# 2 Truth v jk 0.8206772
# 3 Truth v mclust 0.9563053
Y <- data.frame(X0, jk=jkv, kv = kv$cluster,
mc=mcv$classification)
p1 <- ggplot(data = Y, aes(x=x1, y=x2)) +
scale_color_manual(values = ggCol)